# download_dp_issue_page_live.py
# DP (Data & Policy) Downloader
# -------------------------------------------------
# Automates downloading PDFs from Cambridge Core *Data & Policy* issue pages
# - Parses article titles and direct PDF links from the provided live issue page URL
# - Supports paginated issues by running separately for each page (20 articles max per page)
# - Handles absolute and relative PDF URLs with urljoin for reliability
# - Skips non-PDF entries automatically
# - Saves each PDF with sanitized filenames for cross-platform compatibility
# - Creates dynamic folder names like DP_Vol3_2022_p1 from issue metadata
# - Reports each download and skipped item on the console (no CSV log file is written)
# - Reusable for other Cambridge Core journals with similar structure
# Data & Policy (Cambridge Core) — Issue Page Downloader (paste URL)
# -------------------------------------------------
# • Prompts for a single Cambridge Core volume page URL (per page)
# • Extracts Volume (zero-padded) + Year robustly from the header
# • Appends page number p{N} from the URL to the folder
# • Scrapes each article card for Title + direct PDF link
# • Downloads PDFs into: DP_Vol{vol}_{year}_p{page}
#
# Examples (per page):
#   https://www.cambridge.org/core/journals/data-and-policy/volume/568D44898D353A111834DB015F9EA9A3
#   https://www.cambridge.org/core/journals/data-and-policy/volume/568D44898D353A111834DB015F9EA9A3?sort=canonical.position%3Aasc&pageNum=2&searchWithinIds=568D44898D353A111834DB015F9EA9A3&productType=JOURNAL_ARTICLE&template=cambridge-core%2Fjournal%2Farticle-listings%2Flistings-wrapper&hideArticleJournalMetaData=true&displayNasaAds=false
#




import os
import re
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs

# Site origin; hrefs scraped from the issue page are resolved against it.
BASE = "https://www.cambridge.org"
# Headers sent with every request made through get(); the User-Agent is a
# desktop Chrome string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )
}
# Value passed as `timeout=` to requests.get (seconds).
TIMEOUT = 60

def sanitize_filename(name: str) -> str:
    """
    Turn an article title into a filename stem that is safe across platforms.

    Steps, in order:
      1. Drop characters that are illegal in Windows filenames: \\ / * ? : " < > |
      2. Collapse every run of whitespace into a single space.
      3. Collapse runs of dots into one dot and trim leading/trailing dots/spaces.
      4. Cut to at most 180 characters and trim the end again.

    Returns the cleaned stem (no extension). If nothing usable is left, the
    placeholder "untitled" is returned so callers never build a bare ".pdf".
    """
    name = re.sub(r'[\\/*?:"<>|]', "", name)
    name = re.sub(r"\s+", " ", name).strip()
    name = re.sub(r"\.+", ".", name).strip(". ")
    # The 180-char cut can land right after a space or dot; trim it again so
    # the stem never ends in one.
    name = name[:180].rstrip(". ")
    # A title made only of stripped characters would otherwise yield "".
    return name or "untitled"

def get(url: str, referer: str | None = None) -> requests.Response:
    """
    Perform a GET request with the module's default headers and timeout.

    url     -- address to fetch.
    referer -- optional value for the Referer header; omitted when falsy.

    Returns the response object. raise_for_status() is called first, so an
    HTTP error status surfaces as an exception instead of a return value.
    """
    extra = {"Referer": referer} if referer else {}
    # Merge into a fresh dict so the shared HEADERS constant is never mutated.
    response = requests.get(url, headers={**HEADERS, **extra}, timeout=TIMEOUT)
    response.raise_for_status()
    return response

def soup_for(url: str) -> BeautifulSoup:
    """Fetch *url* via get() and parse the response text with html.parser."""
    return BeautifulSoup(get(url).text, "html.parser")

def parse_volume_year(soup: BeautifulSoup) -> tuple[str, str]:
    """
    Work out (volume, year) for the issue shown on the page.

    Lookup order, each step only filling in what is still missing:
      1. <h2 class="heading_07">: the <span class="volume"> text and a
         4-digit year inside <span class="issue-date">.
      2. The whole text of that <h2>, searched with regexes.
      3. The document <title>, searched with the same regexes.

    A purely numeric volume is zero-padded to two digits; a missing volume
    becomes "00" and a missing year becomes the literal string "Year".
    """
    year_re = re.compile(r"(19|20)\d{2}")
    volume_re = re.compile(r"Volume\s+(\d+)", flags=re.I)

    def squash(raw: str) -> str:
        # Non-breaking spaces count as whitespace; collapse all runs to one space.
        return re.sub(r"\s+", " ", (raw or "").replace("\xa0", " ")).strip()

    def first_year(text: str) -> str:
        hit = year_re.search(text)
        return hit.group(0) if hit else ""

    def first_volume(text: str) -> str:
        hit = volume_re.search(text)
        return hit.group(1) if hit else ""

    volume = ""
    year = ""

    header = soup.find("h2", class_=re.compile(r"\bheading_07\b"))
    if header:
        volume_span = header.find("span", class_=re.compile(r"\bvolume\b"))
        year_span = header.find("span", class_=re.compile(r"\bissue-date\b"))
        if volume_span:
            volume = squash(volume_span.get_text())
        if year_span:
            # Only the 4-digit year is kept from the issue-date span.
            year = first_year(year_span.get_text())

        # Anything the dedicated spans did not provide is searched for in the
        # full header text.
        header_text = squash(header.get_text())
        year = year or first_year(header_text)
        volume = volume or first_volume(header_text)

    # Last resort: the page <title>.
    if not (volume and year) and soup.title:
        title_text = squash(soup.title.get_text())
        volume = volume or first_volume(title_text)
        year = year or first_year(title_text)

    # Placeholders and zero-padding.
    if not volume:
        volume = "00"
    if volume.isdigit():
        volume = volume.zfill(2)

    return volume, (year or "Year")

def get_page_number_from_url(url: str) -> str:
    """
    Return the listing page number encoded in *url* as a string.

    Cambridge uses the query parameter pageNum=N for subsequent pages; the
    first page has no such parameter, so the default is "1".

    The value ends up in the output folder name, so only a plain run of
    ASCII digits is accepted. Anything else (missing, blank, non-numeric,
    or containing path characters such as "../") falls back to "1".
    """
    values = parse_qs(urlparse(url).query).get("pageNum", [])
    candidate = values[0].strip() if values else ""
    if candidate.isascii() and candidate.isdigit():
        return candidate
    return "1"

def _absolute_url(href: str) -> str:
    """Resolve *href* against BASE; return it unchanged if it cannot be parsed."""
    if not href:
        return ""
    try:
        # urljoin leaves absolute URLs alone and resolves every relative form
        # (with or without a leading slash, or protocol-relative) against BASE.
        return urljoin(BASE, href)
    except ValueError:
        return href


def collect_cards(soup: BeautifulSoup) -> list[dict]:
    """
    Extract one record per article card on an issue listing page.

    Each article card lives under .product-listing-with-inputs-content.
    Title: .details li.title h3 a.part-link[href]
    PDF  : first anchor in the card whose href contains
           '/core/services/aop-cambridge-core/content/view/' and ends with
           '.pdf' once any query string or fragment is ignored.

    Cards without a title link are skipped.

    Returns a list of dicts with the keys "title", "article_url" and
    "pdf_url". Both URLs are absolute; "pdf_url" is "" when the card has no
    matching PDF link.
    """
    pdf_marker = "/core/services/aop-cambridge-core/content/view/"
    items = []
    for card in soup.select(".product-listing-with-inputs-content"):
        a_title = card.select_one(".details li.title h3 a.part-link[href]")
        if a_title is None:
            continue
        title = a_title.get_text(" ", strip=True)
        article_url = _absolute_url((a_title.get("href") or "").strip())

        pdf_url = ""
        for a in card.select("a[href]"):
            ahref = (a.get("href") or "").strip()
            if pdf_marker not in ahref:
                continue
            # Compare the part before any '#fragment' or '?query' so links such
            # as '….pdf?download=true' are still recognised.
            bare = ahref.lower().split("#", 1)[0].split("?", 1)[0]
            if bare.endswith(".pdf"):
                pdf_url = _absolute_url(ahref)
                break

        items.append({
            "title": title,
            "article_url": article_url,
            "pdf_url": pdf_url,
        })
    return items

def ensure_pdf(resp: requests.Response) -> bool:
    """
    Report whether *resp* looks like a PDF.

    True when the Content-Type header mentions "pdf" (case-insensitive) or,
    failing that, when the body starts with the bytes "%PDF-".
    """
    content_type = resp.headers.get("Content-Type") or ""
    if "pdf" in content_type.lower():
        return True
    return resp.content[:5] == b"%PDF-"

def main() -> None:
    """
    Prompt for one issue-page URL and download every PDF linked from it.

    Flow:
      1. Read the URL from stdin; exit with status 1 if it is empty or the
         page cannot be fetched.
      2. Derive the output folder DP_Vol{vol}_{year}_p{page} from the page
         header and the URL, and create it if needed.
      3. For each article card, download its PDF into that folder, naming the
         file after the sanitized title. Progress is printed per item.

    Per-item failures are reported and skipped; they never abort the run.
    """
    issue_url = input("Paste Data & Policy volume page URL (per page): ").strip()
    if not issue_url:
        print("No URL provided. Exiting.")
        raise SystemExit(1)

    print("[INFO] Fetching page…")
    try:
        soup = soup_for(issue_url)
    except requests.RequestException as e:
        print(f"[ERROR] Could not fetch issue page: {e}")
        raise SystemExit(1) from e

    vol, year = parse_volume_year(soup)
    page_num = get_page_number_from_url(issue_url)
    # vol comes from scraped HTML and page_num from the pasted URL, so run the
    # folder name through the same sanitizer as the file names.
    outdir = sanitize_filename(f"DP_Vol{vol}_{year}_p{page_num}")
    os.makedirs(outdir, exist_ok=True)
    print(f"[INFO] Output folder: {outdir}")

    items = collect_cards(soup)
    print(f"[INFO] Found {len(items)} article cards on this page")

    saved = 0
    # Case-folded stems already written in this run; used to keep articles
    # with identical titles (e.g. "Editorial") from overwriting each other.
    used_stems: set[str] = set()
    for i, it in enumerate(items, 1):
        # Fall back to a positional name if the title sanitizes to nothing.
        title = sanitize_filename(it["title"]) or f"article_{i}"
        article_url = it["article_url"]
        pdf_url = it["pdf_url"]

        if not pdf_url:
            print(f"[{i}] ⚠️ No PDF link for: {title}")
            continue

        try:
            resp = get(pdf_url, referer=article_url)
            if not ensure_pdf(resp):
                print(f"[{i}] ❌ Not a PDF response: {title}")
                continue

            stem = title
            suffix = 2
            while stem.casefold() in used_stems:
                stem = f"{title} ({suffix})"
                suffix += 1
            used_stems.add(stem.casefold())

            path = os.path.join(outdir, f"{stem}.pdf")
            with open(path, "wb") as f:
                f.write(resp.content)
            print(f"[{i}] ✅ Saved: {stem}.pdf")
            saved += 1
        except Exception as e:
            # Deliberately broad: one bad article must not stop the batch.
            print(f"[{i}] ❌ Error: {e}")
        finally:
            # Polite pause after every request, including failed ones.
            time.sleep(0.5)

    print(f"\nDone! {saved} PDFs saved in {outdir}")


if __name__ == "__main__":
    main()
